gvc_opentargets

Setup environment

library(tidyverse)
library(ggformula)
library(janitor)
library(skimr)
library(broom)
library(readxl)
library(jsonlite)

# Make the black-and-white ggplot2 theme the default for later plots.
theme_set(theme_bw())

# Fix the random seed so any randomized step that follows is repeatable.
set.seed(666)

Read and prep data

Genes within a 1 Mb window of the GVC loci:

# Load the GVC comparison spreadsheet, normalize the column names with
# clean_names(), split the gene ID into ID and version, and drop the version
# together with the two annotation columns.
gvc <- read_xlsx("GVC_1Mb_comparison_050224.xlsx") |>
  clean_names() |>
  separate(col = gene_id, into = c("gene_id", "version")) |>
  select(!c(version, agora_nominated_list, opentarget_info))

gvc

Gene prioritization scores from Agora (<https://www.synapse.org/Synapse:syn25741025>):

# Parse the Agora overall-scores JSON (simplified by jsonlite) into a tibble.
ago <- as_tibble(
  read_json("syn25741025.overall_scores.json", simplifyVector = TRUE)
)

ago
# Number of GVC rows whose Ensembl gene ID also appears in the Agora table.
sum(gvc$gene_id %in% ago$ensembl_gene_id)
[1] 2248
# Number of GVC rows whose gene symbol appears among the Agora HGNC symbols.
sum(gvc$gene_symbol %in% ago$hgnc_symbol)
[1] 2234

Targets associated with MONDO_0004975 from Open Targets (export v24_03):

# Read the Open Targets associated-targets export; show_col_types = FALSE
# silences readr's column specification message.
ot <- read_tsv(
  "OT-MONDO_0004975-associated-targets-5_4_2024-v24_03.tsv",
  show_col_types = FALSE
)

ot
# Count Open Targets symbols that are absent from the Agora HGNC symbols.
# Written with base `%in%` so the line does not rely on a `%notin%` operator
# that has to be defined somewhere outside this script.
sum(!(ot$symbol %in% ago$hgnc_symbol))
[1] 293
library(gprofiler2)
# Keep the Open Targets column names so exactly those columns can be selected
# again after the join below.
otcols <- colnames(ot)

# Convert the Open Targets symbols to Ensembl gene IDs with g:Convert and
# carry the Open Targets columns along with each returned ID.
otensg <- gconvert(
  query = ot$symbol,
  organism = "hsapiens",
  target = "ENSG",
  mthreshold = Inf,
  filter_na = TRUE
) |>
  # The join key on the `ot` side comes from rownames_to_column(), which is
  # character, so input_number is converted to match.
  mutate(input_number = as.character(input_number)) |>
  left_join(
    rownames_to_column(ot, var = "input_number"),
    by = "input_number"
  ) |>
  # all_of() marks `otcols` as an external character vector; passing the bare
  # name is deprecated in tidyselect and would silently pick a column named
  # `otcols` if one existed.
  select(ensembl_gene_id = target, all_of(otcols))

otensg
# Attach the Agora scores to the GVC genes by Ensembl gene ID.
d <- left_join(gvc, ago, by = join_by(gene_id == ensembl_gene_id))
# Attach the Open Targets columns using the same key.
# NOTE(review): neither join declares `relationship =`, so duplicated keys on
# either side would multiply rows here - confirm that is intended.
d <- left_join(d, otensg, by = join_by(gene_id == ensembl_gene_id))
# Order by descending globalScore.
d <- arrange(d, desc(globalScore))

d
skim(d)
Data summary
Name d
Number of rows 2473
Number of columns 61
_______________________
Column type frequency:
character 51
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
grouped_loci_rs_id_gvc 0 1.00 6 11 0 146 0
grouped_loci_gvc 0 1.00 3 43 0 84 0
chrom 0 1.00 4 5 0 21 0
rsid 0 1.00 6 11 0 146 0
ref 0 1.00 1 1 0 4 0
alt 0 1.00 1 5 0 14 0
gene_id 0 1.00 15 15 0 1344 0
gene_strand 0 1.00 1 1 0 2 0
gene_symbol 0 1.00 3 14 0 1344 0
gene_type 0 1.00 14 14 0 1 0
hgnc_symbol 221 0.91 3 14 0 1212 0
symbol 1649 0.33 3 11 0 404 0
otGeneticsPortal 1649 0.33 7 20 0 196 0
geneBurden 1649 0.33 7 7 0 1 0
eva 1649 0.33 1 19 0 9 0
genomicsEngland 1649 0.33 7 18 0 4 0
gene2Phenotype 1649 0.33 7 7 0 1 0
uniprotLiterature 1649 0.33 7 18 0 3 0
uniprotVariants 1649 0.33 7 18 0 3 0
orphanet 1649 0.33 7 17 0 2 0
clingen 1649 0.33 7 7 0 1 0
cancerGeneCensus 1649 0.33 7 7 0 1 0
intogen 1649 0.33 7 7 0 1 0
evaSomatic 1649 0.33 7 7 0 1 0
cancerBiomarkers 1649 0.33 7 7 0 1 0
chembl 1649 0.33 7 19 0 14 0
crisprScreen 1649 0.33 7 19 0 41 0
crispr 1649 0.33 7 7 0 1 0
slapenrich 1649 0.33 7 7 0 1 0
progeny 1649 0.33 7 7 0 1 0
reactome 1649 0.33 7 17 0 2 0
sysbio 1649 0.33 7 19 0 2 0
europepmc 1649 0.33 7 20 0 43 0
expressionAtlas 1649 0.33 7 20 0 173 0
impc 1649 0.33 7 19 0 7 0
maxClinicalTrialPhase 1649 0.33 1 7 0 4 0
isInMembrane 1649 0.33 1 7 0 3 0
isSecreted 1649 0.33 1 7 0 3 0
hasLigand 1649 0.33 1 7 0 3 0
hasSmallMoleculeBinder 1649 0.33 1 7 0 3 0
hasPocket 1649 0.33 1 7 0 3 0
mouseOrthologMaxIdentityPercentage 1649 0.33 1 20 0 272 0
hasHighQualityChemicalProbes 1649 0.33 1 7 0 3 0
geneticConstraint 1649 0.33 7 22 0 403 0
mouseKoScore 1649 0.33 1 21 0 230 0
geneEssentiality 1649 0.33 1 7 0 3 0
hasSafetyEvent 1649 0.33 2 7 0 2 0
isCancerDriverGene 1649 0.33 2 7 0 2 0
paralogMaxIdentityPercentage 1649 0.33 1 21 0 73 0
tissueSpecificity 1649 0.33 1 7 0 5 0
tissueDistribution 1649 0.33 1 7 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
chrom_start 0 1.00 62436245.64 48374178.54 413333.00 31121778.00 51224705.00 94979791.00 233173930.00 ▇▆▂▂▁
chrom_end 0 1.00 62436246.64 48374178.54 413334.00 31121779.00 51224706.00 94979792.00 233173931.00 ▇▆▂▂▁
distance 0 1.00 12719.87 278764.16 -499286.00 -218912.00 14939.00 241039.00 499987.00 ▆▇▇▇▇
absolute_distance 0 1.00 238261.90 145188.23 114.00 115708.00 229173.00 361466.00 499987.00 ▇▇▇▇▆
gene_start 0 1.00 62406168.65 48373400.00 87249.00 31131432.00 50825288.00 95244912.00 233671897.00 ▇▆▂▂▁
gene_end 0 1.00 62440568.15 48376158.70 97094.00 31135727.00 50837213.00 95269201.00 233773300.00 ▇▆▂▂▁
target_risk_score 221 0.91 2.46 0.99 0.57 1.57 2.44 3.34 4.71 ▆▇▇▇▂
genetics_score 221 0.91 1.66 0.46 0.57 1.28 1.64 2.00 2.90 ▂▇▇▆▁
multi_omics_score 465 0.81 0.90 0.76 0.00 0.00 1.01 1.54 2.00 ▇▂▂▅▅
globalScore 1649 0.33 0.13 0.16 0.00 0.01 0.06 0.23 0.84 ▇▂▁▁▁
# Unique Open Targets symbols, ordered from highest to lowest globalScore.
query <- ot |>
  arrange(desc(globalScore)) |>
  pull(symbol) |>
  unique()

# Enrichment analysis on the ranked symbol list (ordered_query = TRUE),
# with FDR correction at a 0.005 threshold.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  significant = TRUE,
  exclude_iea = TRUE,
  user_threshold = 0.005,
  correction_method = "fdr",
  domain_scope = "annotated"
)

gostres$result
gostplot(gostres, capped = FALSE, interactive = TRUE)
# Spearman correlation between the Open Targets globalScore and the Agora
# target_risk_score, restricted to the rows slice_max() keeps for the 100
# highest globalScore values and then to rows where both scores are present.
otensg |>
  left_join(ago, by = join_by(ensembl_gene_id)) |>
  slice_max(globalScore, n = 100) |>
  select(
    ensembl_gene_id,
    symbol,
    globalScore,
    target_risk_score,
    genetics_score,
    multi_omics_score
  ) |>
  drop_na(globalScore, target_risk_score) |>
  # An unnamed data-frame result is spread into columns by summarize().
  summarize(
    tidy(cor.test(globalScore, target_risk_score, method = "spearman"))
  )